{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Naive Bayes - Trabalho\n",
    "\n",
    "## Questão 1\n",
    "\n",
    "Implemente um classifacor Naive Bayes para o problema de predizer a qualidade de um carro. Para este fim, utilizaremos um conjunto de dados referente a qualidade de carros, disponível no [UCI](https://archive.ics.uci.edu/ml/datasets/car+evaluation). Este dataset de carros possui as seguintes features e classe:\n",
    "\n",
    "** Attributos **\n",
    "1. buying: vhigh, high, med, low\n",
    "2. maint: vhigh, high, med, low\n",
    "3. doors: 2, 3, 4, 5, more\n",
    "4. persons: 2, 4, more\n",
    "5. lug_boot: small, med, big\n",
    "6. safety: low, med, high\n",
    "\n",
    "** Classes **\n",
    "1. unacc, acc, good, vgood\n",
    "\n",
    "## Questão 2\n",
    "Crie uma versão de sua implementação usando as funções disponíveis na biblioteca SciKitLearn para o Naive Bayes ([veja aqui](http://scikit-learn.org/stable/modules/naive_bayes.html)) \n",
    "\n",
    "## Questão 3\n",
    "\n",
    "Analise a acurácia dos dois algoritmos e discuta a sua solução."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import random\n",
    "import math\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "def separateByClass(dataset):\n",
    "    separated = {}\n",
    "    for _,row in dataset.iterrows():\n",
    "        vector = row\n",
    "        if (vector[-1] not in separated):\n",
    "            separated[vector[-1]] = []\n",
    "        separated[vector[-1]].append(vector)\n",
    "    return separated\n",
    "\n",
    "def mean(numbers):\n",
    "    return sum(numbers)/float(len(numbers))\n",
    " \n",
    "def stdev(numbers):\n",
    "    avg = mean(numbers)\n",
    "    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)\n",
    "    return math.sqrt(variance)\n",
    "\n",
    "def summarize(dataset):\n",
    "    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]\n",
    "    del summaries[-1]\n",
    "    return summaries\n",
    "\n",
    "def summarizeByClass(dataset):\n",
    "    separated = separateByClass(dataset)\n",
    "    summaries = {}\n",
    "    for classValue, instances in separated.items():\n",
    "        summaries[classValue] = summarize(instances)\n",
    "    return summaries\n",
    "\n",
    "def calculateProbability(x, mean, stdev):\n",
    "    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))\n",
    "    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent\n",
    "\n",
    "def calculateClassProbabilities(summaries, inputVector):\n",
    "    probabilities = {}\n",
    "    for classValue, classSummaries in summaries.items():\n",
    "        probabilities[classValue] = 1\n",
    "        for i in range(len(classSummaries)):\n",
    "            mean, stdev = classSummaries[i]\n",
    "            if stdev == 0.0:\n",
    "                stdev = 10000.0\n",
    "            x = inputVector[i]\n",
    "            probabilities[classValue] *= calculateProbability(x, mean, stdev)\n",
    "    return probabilities\n",
    "\n",
    "def predict(summaries, inputVector):\n",
    "    probabilities = calculateClassProbabilities(summaries, inputVector)\n",
    "    bestLabel, bestProb = None, -1\n",
    "    for classValue, probability in probabilities.items():\n",
    "        if bestLabel is None or probability > bestProb:\n",
    "            bestProb = probability\n",
    "            bestLabel = classValue\n",
    "    return bestLabel\n",
    "\n",
    "def getPredictions(summaries, testSet):\n",
    "    predictions = []\n",
    "    for _,row in testSet.iterrows():\n",
    "        result = predict(summaries, row)\n",
    "        predictions.append(result)\n",
    "    return predictions\n",
    "\n",
    "def getAccuracy(testSet, predictions):\n",
    "    correct = 0\n",
    "    for i in range(len(testSet)):\n",
    "        if testSet.iloc[i] == predictions[i]:\n",
    "            correct += 1\n",
    "    return (correct/float(len(testSet))) * 100.0\n",
    "\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>vhigh</th>\n",
       "      <th>vhigh.1</th>\n",
       "      <th>2</th>\n",
       "      <th>2.1</th>\n",
       "      <th>small</th>\n",
       "      <th>low</th>\n",
       "      <th>Class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>611</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1599</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1011</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>733</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1612</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      vhigh  vhigh.1  2  2.1  small  low  Class\n",
       "611       0        0  2    2      2    1      2\n",
       "1599      1        2  3    0      0    2      2\n",
       "1011      2        0  1    1      1    2      2\n",
       "733       0        2  3    0      1    0      2\n",
       "1612      1        2  3    2      2    0      1"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename = 'carData.csv'\n",
    "dataset = pd.read_csv(filename)\n",
    "for i in range(0, dataset.shape[1]):\n",
    "    dataset.iloc[:,i] = LabelEncoder().fit_transform(dataset.iloc[:,i])\n",
    "    \n",
    "Y = dataset.unacc.copy()\n",
    "del dataset['unacc']\n",
    "X = dataset\n",
    "\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3)\n",
    "\n",
    "X_train_sk = X_train.copy()\n",
    "\n",
    "idx = X_train.shape[1]\n",
    "new_col = Y_train \n",
    "X_train.insert(loc=idx, column='Class', value=new_col)\n",
    "\n",
    "X_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "958     0\n",
       "1309    0\n",
       "23      2\n",
       "1598    2\n",
       "1525    1\n",
       "Name: unacc, dtype: int64"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summaries = summarizeByClass(X_train)\n",
    "Y_test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = getPredictions(summaries, X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hand-made Gaussian NB\n",
      "Accuracy =  66.85934489402698\n",
      "\n",
      "Classification Report:\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      unacc       0.48      0.75      0.58       126\n",
      "        acc       0.24      1.00      0.38        18\n",
      "       good       0.96      0.65      0.77       361\n",
      "      vgood       0.00      0.00      0.00        14\n",
      "\n",
      "avg / total       0.79      0.67      0.69       519\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/vilmabezerraalves/miniconda3/envs/DataScience/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "acc = getAccuracy(Y_test, y_pred)\n",
    "print('Hand-made Gaussian NB\\nAccuracy = ',acc)\n",
    "print(\"\\nClassification Report:\")\n",
    "print(classification_report(y_true=Y_test, y_pred=y_pred, target_names=[\"unacc\", \"acc\", \"good\", \"vgood\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SciKitLearn Gaussian NB\n",
      "Accuracy =  60.886319845857415\n",
      "\n",
      "Classification Report:\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      unacc       0.39      0.07      0.12       126\n",
      "        acc       0.00      0.00      0.00        18\n",
      "       good       0.84      0.81      0.83       361\n",
      "      vgood       0.09      1.00      0.17        14\n",
      "\n",
      "avg / total       0.68      0.61      0.61       519\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/vilmabezerraalves/miniconda3/envs/DataScience/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "naive = GaussianNB()\n",
    "naive.fit(X_train_sk.values, Y_train.values)\n",
    "\n",
    "y_pred_sk = naive.predict(X_test.values)\n",
    "accSK = getAccuracy(Y_test, y_pred_sk)\n",
    "print('SciKitLearn Gaussian NB\\nAccuracy = ',accSK)\n",
    "print(\"\\nClassification Report:\")\n",
    "print(classification_report(y_true=Y_test, y_pred=y_pred_sk, target_names=[\"unacc\", \"acc\", \"good\", \"vgood\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
